{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "b'Skipping line 6452: expected 8 fields, saw 9\\nSkipping line 43667: expected 8 fields, saw 10\\nSkipping line 51751: expected 8 fields, saw 9\\n'\n",
      "b'Skipping line 92038: expected 8 fields, saw 9\\nSkipping line 104319: expected 8 fields, saw 9\\nSkipping line 121768: expected 8 fields, saw 9\\n'\n",
      "b'Skipping line 144058: expected 8 fields, saw 9\\nSkipping line 150789: expected 8 fields, saw 9\\nSkipping line 157128: expected 8 fields, saw 9\\nSkipping line 180189: expected 8 fields, saw 9\\nSkipping line 185738: expected 8 fields, saw 9\\n'\n",
      "b'Skipping line 209388: expected 8 fields, saw 9\\nSkipping line 220626: expected 8 fields, saw 9\\nSkipping line 227933: expected 8 fields, saw 11\\nSkipping line 228957: expected 8 fields, saw 10\\nSkipping line 245933: expected 8 fields, saw 9\\nSkipping line 251296: expected 8 fields, saw 9\\nSkipping line 259941: expected 8 fields, saw 9\\nSkipping line 261529: expected 8 fields, saw 9\\n'\n",
      "C:\\Users\\Susan\\Anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:2717: DtypeWarning: Columns (3) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  interactivity=interactivity, compiler=compiler, result=result)\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from scipy.sparse import csr_matrix\n",
    "import sklearn\n",
    "from sklearn.decomposition import TruncatedSVD\n",
    "\n",
    "book = pd.read_csv('BX-Books.csv', sep=';', error_bad_lines=False, encoding=\"latin-1\")\n",
    "book.columns = ['ISBN', 'bookTitle', 'bookAuthor', 'yearOfPublication', 'publisher', 'imageUrlS', 'imageUrlM', 'imageUrlL']\n",
    "user = pd.read_csv('BX-Users.csv', sep=';', error_bad_lines=False, encoding=\"latin-1\")\n",
    "user.columns = ['userID', 'Location', 'Age']\n",
    "rating = pd.read_csv('BX-Book-Ratings.csv', sep=';', error_bad_lines=False, encoding=\"latin-1\")\n",
    "rating.columns = ['userID', 'ISBN', 'bookRating']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>userID</th>\n",
       "      <th>ISBN</th>\n",
       "      <th>bookRating</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>276725</td>\n",
       "      <td>034545104X</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>276726</td>\n",
       "      <td>0155061224</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>276727</td>\n",
       "      <td>0446520802</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>276729</td>\n",
       "      <td>052165615X</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>276729</td>\n",
       "      <td>0521795028</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   userID        ISBN  bookRating\n",
       "0  276725  034545104X           0\n",
       "1  276726  0155061224           5\n",
       "2  276727  0446520802           0\n",
       "3  276729  052165615X           3\n",
       "4  276729  0521795028           6"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rating.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>userID</th>\n",
       "      <th>Location</th>\n",
       "      <th>Age</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>nyc, new york, usa</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>stockton, california, usa</td>\n",
       "      <td>18.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>moscow, yukon territory, russia</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>porto, v.n.gaia, portugal</td>\n",
       "      <td>17.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>farnborough, hants, united kingdom</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   userID                            Location   Age\n",
       "0       1                  nyc, new york, usa   NaN\n",
       "1       2           stockton, california, usa  18.0\n",
       "2       3     moscow, yukon territory, russia   NaN\n",
       "3       4           porto, v.n.gaia, portugal  17.0\n",
       "4       5  farnborough, hants, united kingdom   NaN"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "user.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ISBN</th>\n",
       "      <th>bookTitle</th>\n",
       "      <th>bookAuthor</th>\n",
       "      <th>yearOfPublication</th>\n",
       "      <th>publisher</th>\n",
       "      <th>imageUrlS</th>\n",
       "      <th>imageUrlM</th>\n",
       "      <th>imageUrlL</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0195153448</td>\n",
       "      <td>Classical Mythology</td>\n",
       "      <td>Mark P. O. Morford</td>\n",
       "      <td>2002</td>\n",
       "      <td>Oxford University Press</td>\n",
       "      <td>http://images.amazon.com/images/P/0195153448.0...</td>\n",
       "      <td>http://images.amazon.com/images/P/0195153448.0...</td>\n",
       "      <td>http://images.amazon.com/images/P/0195153448.0...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0002005018</td>\n",
       "      <td>Clara Callan</td>\n",
       "      <td>Richard Bruce Wright</td>\n",
       "      <td>2001</td>\n",
       "      <td>HarperFlamingo Canada</td>\n",
       "      <td>http://images.amazon.com/images/P/0002005018.0...</td>\n",
       "      <td>http://images.amazon.com/images/P/0002005018.0...</td>\n",
       "      <td>http://images.amazon.com/images/P/0002005018.0...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0060973129</td>\n",
       "      <td>Decision in Normandy</td>\n",
       "      <td>Carlo D'Este</td>\n",
       "      <td>1991</td>\n",
       "      <td>HarperPerennial</td>\n",
       "      <td>http://images.amazon.com/images/P/0060973129.0...</td>\n",
       "      <td>http://images.amazon.com/images/P/0060973129.0...</td>\n",
       "      <td>http://images.amazon.com/images/P/0060973129.0...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0374157065</td>\n",
       "      <td>Flu: The Story of the Great Influenza Pandemic...</td>\n",
       "      <td>Gina Bari Kolata</td>\n",
       "      <td>1999</td>\n",
       "      <td>Farrar Straus Giroux</td>\n",
       "      <td>http://images.amazon.com/images/P/0374157065.0...</td>\n",
       "      <td>http://images.amazon.com/images/P/0374157065.0...</td>\n",
       "      <td>http://images.amazon.com/images/P/0374157065.0...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0393045218</td>\n",
       "      <td>The Mummies of Urumchi</td>\n",
       "      <td>E. J. W. Barber</td>\n",
       "      <td>1999</td>\n",
       "      <td>W. W. Norton &amp;amp; Company</td>\n",
       "      <td>http://images.amazon.com/images/P/0393045218.0...</td>\n",
       "      <td>http://images.amazon.com/images/P/0393045218.0...</td>\n",
       "      <td>http://images.amazon.com/images/P/0393045218.0...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         ISBN                                          bookTitle  \\\n",
       "0  0195153448                                Classical Mythology   \n",
       "1  0002005018                                       Clara Callan   \n",
       "2  0060973129                               Decision in Normandy   \n",
       "3  0374157065  Flu: The Story of the Great Influenza Pandemic...   \n",
       "4  0393045218                             The Mummies of Urumchi   \n",
       "\n",
       "             bookAuthor yearOfPublication                   publisher  \\\n",
       "0    Mark P. O. Morford              2002     Oxford University Press   \n",
       "1  Richard Bruce Wright              2001       HarperFlamingo Canada   \n",
       "2          Carlo D'Este              1991             HarperPerennial   \n",
       "3      Gina Bari Kolata              1999        Farrar Straus Giroux   \n",
       "4       E. J. W. Barber              1999  W. W. Norton &amp; Company   \n",
       "\n",
       "                                           imageUrlS  \\\n",
       "0  http://images.amazon.com/images/P/0195153448.0...   \n",
       "1  http://images.amazon.com/images/P/0002005018.0...   \n",
       "2  http://images.amazon.com/images/P/0060973129.0...   \n",
       "3  http://images.amazon.com/images/P/0374157065.0...   \n",
       "4  http://images.amazon.com/images/P/0393045218.0...   \n",
       "\n",
       "                                           imageUrlM  \\\n",
       "0  http://images.amazon.com/images/P/0195153448.0...   \n",
       "1  http://images.amazon.com/images/P/0002005018.0...   \n",
       "2  http://images.amazon.com/images/P/0060973129.0...   \n",
       "3  http://images.amazon.com/images/P/0374157065.0...   \n",
       "4  http://images.amazon.com/images/P/0393045218.0...   \n",
       "\n",
       "                                           imageUrlL  \n",
       "0  http://images.amazon.com/images/P/0195153448.0...  \n",
       "1  http://images.amazon.com/images/P/0002005018.0...  \n",
       "2  http://images.amazon.com/images/P/0060973129.0...  \n",
       "3  http://images.amazon.com/images/P/0374157065.0...  \n",
       "4  http://images.amazon.com/images/P/0393045218.0...  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "book.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>userID</th>\n",
       "      <th>ISBN</th>\n",
       "      <th>bookRating</th>\n",
       "      <th>bookTitle</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>276725</td>\n",
       "      <td>034545104X</td>\n",
       "      <td>0</td>\n",
       "      <td>Flesh Tones: A Novel</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2313</td>\n",
       "      <td>034545104X</td>\n",
       "      <td>5</td>\n",
       "      <td>Flesh Tones: A Novel</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>6543</td>\n",
       "      <td>034545104X</td>\n",
       "      <td>0</td>\n",
       "      <td>Flesh Tones: A Novel</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>8680</td>\n",
       "      <td>034545104X</td>\n",
       "      <td>5</td>\n",
       "      <td>Flesh Tones: A Novel</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10314</td>\n",
       "      <td>034545104X</td>\n",
       "      <td>9</td>\n",
       "      <td>Flesh Tones: A Novel</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   userID        ISBN  bookRating             bookTitle\n",
       "0  276725  034545104X           0  Flesh Tones: A Novel\n",
       "1    2313  034545104X           5  Flesh Tones: A Novel\n",
       "2    6543  034545104X           0  Flesh Tones: A Novel\n",
       "3    8680  034545104X           5  Flesh Tones: A Novel\n",
       "4   10314  034545104X           9  Flesh Tones: A Novel"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "combine_book_rating = pd.merge(rating, book, on='ISBN')\n",
    "columns = ['yearOfPublication', 'publisher', 'bookAuthor', 'imageUrlS', 'imageUrlM', 'imageUrlL']\n",
    "combine_book_rating = combine_book_rating.drop(columns, axis=1)\n",
    "combine_book_rating.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Filter to only popular books\n",
    "\n",
    "Remove rows where book title is missing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "combine_book_rating = combine_book_rating.dropna(axis = 0, subset = ['bookTitle'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>bookTitle</th>\n",
       "      <th>totalRatingCount</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A Light in the Storm: The Civil War Diary of ...</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Always Have Popsicles</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Apple Magic (The Collector's series)</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Ask Lily (Young Women of Faith: Lily Series, ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Beyond IBM: Leadership Marketing and Finance ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           bookTitle  totalRatingCount\n",
       "0   A Light in the Storm: The Civil War Diary of ...                 4\n",
       "1                              Always Have Popsicles                 1\n",
       "2               Apple Magic (The Collector's series)                 1\n",
       "3   Ask Lily (Young Women of Faith: Lily Series, ...                 1\n",
       "4   Beyond IBM: Leadership Marketing and Finance ...                 1"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "book_ratingCount = (combine_book_rating.\n",
    "     groupby(by = ['bookTitle'])['bookRating'].\n",
    "     count().\n",
    "     reset_index().\n",
    "     rename(columns = {'bookRating': 'totalRatingCount'})\n",
    "     [['bookTitle', 'totalRatingCount']]\n",
    "    )\n",
    "book_ratingCount.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Now we can merge the total rating count data into the rating data, giving us exactly what we need to filter out the lesser known books."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>userID</th>\n",
       "      <th>ISBN</th>\n",
       "      <th>bookRating</th>\n",
       "      <th>bookTitle</th>\n",
       "      <th>totalRatingCount</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>276725</td>\n",
       "      <td>034545104X</td>\n",
       "      <td>0</td>\n",
       "      <td>Flesh Tones: A Novel</td>\n",
       "      <td>60</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2313</td>\n",
       "      <td>034545104X</td>\n",
       "      <td>5</td>\n",
       "      <td>Flesh Tones: A Novel</td>\n",
       "      <td>60</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>6543</td>\n",
       "      <td>034545104X</td>\n",
       "      <td>0</td>\n",
       "      <td>Flesh Tones: A Novel</td>\n",
       "      <td>60</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>8680</td>\n",
       "      <td>034545104X</td>\n",
       "      <td>5</td>\n",
       "      <td>Flesh Tones: A Novel</td>\n",
       "      <td>60</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10314</td>\n",
       "      <td>034545104X</td>\n",
       "      <td>9</td>\n",
       "      <td>Flesh Tones: A Novel</td>\n",
       "      <td>60</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   userID        ISBN  bookRating             bookTitle  totalRatingCount\n",
       "0  276725  034545104X           0  Flesh Tones: A Novel                60\n",
       "1    2313  034545104X           5  Flesh Tones: A Novel                60\n",
       "2    6543  034545104X           0  Flesh Tones: A Novel                60\n",
       "3    8680  034545104X           5  Flesh Tones: A Novel                60\n",
       "4   10314  034545104X           9  Flesh Tones: A Novel                60"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rating_with_totalRatingCount = combine_book_rating.merge(book_ratingCount, left_on = 'bookTitle', right_on = 'bookTitle', how = 'left')\n",
    "rating_with_totalRatingCount.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "count   241071.000\n",
      "mean         4.277\n",
      "std         16.739\n",
      "min          1.000\n",
      "25%          1.000\n",
      "50%          1.000\n",
      "75%          3.000\n",
      "max       2502.000\n",
      "Name: totalRatingCount, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "pd.set_option('display.float_format', lambda x: '%.3f' % x)\n",
    "print(book_ratingCount['totalRatingCount'].describe())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### The median book has only been rated one time. Let’s take a look at the top of the distribution."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.900    7.000\n",
      "0.910    8.000\n",
      "0.920    9.000\n",
      "0.930   10.000\n",
      "0.940   11.000\n",
      "0.950   13.000\n",
      "0.960   16.000\n",
      "0.970   20.000\n",
      "0.980   29.000\n",
      "0.990   50.000\n",
      "Name: totalRatingCount, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "print(book_ratingCount['totalRatingCount'].quantile(np.arange(.9, 1, .01)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### So about 1% of books have 50 ratings, 2% have 29 ratings. Since we have so many books in our data, we’ll limit it to the top 1%, this will give us 2713 different books. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>userID</th>\n",
       "      <th>ISBN</th>\n",
       "      <th>bookRating</th>\n",
       "      <th>bookTitle</th>\n",
       "      <th>totalRatingCount</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>276725</td>\n",
       "      <td>034545104X</td>\n",
       "      <td>0</td>\n",
       "      <td>Flesh Tones: A Novel</td>\n",
       "      <td>60</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2313</td>\n",
       "      <td>034545104X</td>\n",
       "      <td>5</td>\n",
       "      <td>Flesh Tones: A Novel</td>\n",
       "      <td>60</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>6543</td>\n",
       "      <td>034545104X</td>\n",
       "      <td>0</td>\n",
       "      <td>Flesh Tones: A Novel</td>\n",
       "      <td>60</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>8680</td>\n",
       "      <td>034545104X</td>\n",
       "      <td>5</td>\n",
       "      <td>Flesh Tones: A Novel</td>\n",
       "      <td>60</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10314</td>\n",
       "      <td>034545104X</td>\n",
       "      <td>9</td>\n",
       "      <td>Flesh Tones: A Novel</td>\n",
       "      <td>60</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   userID        ISBN  bookRating             bookTitle  totalRatingCount\n",
       "0  276725  034545104X           0  Flesh Tones: A Novel                60\n",
       "1    2313  034545104X           5  Flesh Tones: A Novel                60\n",
       "2    6543  034545104X           0  Flesh Tones: A Novel                60\n",
       "3    8680  034545104X           5  Flesh Tones: A Novel                60\n",
       "4   10314  034545104X           9  Flesh Tones: A Novel                60"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "popularity_threshold = 50\n",
    "rating_popular_book = rating_with_totalRatingCount.query('totalRatingCount >= @popularity_threshold')\n",
    "rating_popular_book.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Filtering to US users only"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>userID</th>\n",
       "      <th>ISBN</th>\n",
       "      <th>bookRating</th>\n",
       "      <th>bookTitle</th>\n",
       "      <th>totalRatingCount</th>\n",
       "      <th>Location</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>276725</td>\n",
       "      <td>034545104X</td>\n",
       "      <td>0</td>\n",
       "      <td>Flesh Tones: A Novel</td>\n",
       "      <td>60</td>\n",
       "      <td>tyler, texas, usa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2313</td>\n",
       "      <td>034545104X</td>\n",
       "      <td>5</td>\n",
       "      <td>Flesh Tones: A Novel</td>\n",
       "      <td>60</td>\n",
       "      <td>cincinnati, ohio, usa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>6543</td>\n",
       "      <td>034545104X</td>\n",
       "      <td>0</td>\n",
       "      <td>Flesh Tones: A Novel</td>\n",
       "      <td>60</td>\n",
       "      <td>strafford, missouri, usa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>8680</td>\n",
       "      <td>034545104X</td>\n",
       "      <td>5</td>\n",
       "      <td>Flesh Tones: A Novel</td>\n",
       "      <td>60</td>\n",
       "      <td>st. charles county, missouri, usa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10314</td>\n",
       "      <td>034545104X</td>\n",
       "      <td>9</td>\n",
       "      <td>Flesh Tones: A Novel</td>\n",
       "      <td>60</td>\n",
       "      <td>beaverton, oregon, usa</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   userID        ISBN  bookRating             bookTitle  totalRatingCount  \\\n",
       "0  276725  034545104X           0  Flesh Tones: A Novel                60   \n",
       "1    2313  034545104X           5  Flesh Tones: A Novel                60   \n",
       "2    6543  034545104X           0  Flesh Tones: A Novel                60   \n",
       "3    8680  034545104X           5  Flesh Tones: A Novel                60   \n",
       "4   10314  034545104X           9  Flesh Tones: A Novel                60   \n",
       "\n",
       "                            Location  \n",
       "0                  tyler, texas, usa  \n",
       "1              cincinnati, ohio, usa  \n",
       "2           strafford, missouri, usa  \n",
       "3  st. charles county, missouri, usa  \n",
       "4             beaverton, oregon, usa  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "combined = rating_popular_book.merge(user, left_on = 'userID', right_on = 'userID', how = 'left')\n",
    "\n",
    "us_canada_user_rating = combined[combined['Location'].str.contains(\"usa|canada\")]\n",
    "us_canada_user_rating=us_canada_user_rating.drop('Age', axis=1)\n",
    "us_canada_user_rating.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Initial dataframe shape (251615, 6)\n",
      "New dataframe shape (248949, 6)\n",
      "Removed 2666 rows\n"
     ]
    }
   ],
   "source": [
    "if not us_canada_user_rating[us_canada_user_rating.duplicated(['userID', 'bookTitle'])].empty:\n",
    "    initial_rows = us_canada_user_rating.shape[0]\n",
    "\n",
    "    print('Initial dataframe shape {0}'.format(us_canada_user_rating.shape))\n",
    "    us_canada_user_rating = us_canada_user_rating.drop_duplicates(['userID', 'bookTitle'])\n",
    "    current_rows = us_canada_user_rating.shape[0]\n",
    "    print('New dataframe shape {0}'.format(us_canada_user_rating.shape))\n",
    "    print('Removed {0} rows'.format(initial_rows - current_rows))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "us_canada_user_rating_pivot = us_canada_user_rating.pivot(index = 'bookTitle', columns = 'userID', values = 'bookRating').fillna(0)\n",
    "us_canada_user_rating_matrix = csr_matrix(us_canada_user_rating_pivot.values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',\n",
       "         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.neighbors import NearestNeighbors\n",
    "\n",
    "model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')\n",
    "model_knn.fit(us_canada_user_rating_matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Recommendations for The Stone Diaries:\n",
      "\n",
      "1: Daisy Fay and the Miracle Man, with distance of 0.8766136287164453:\n",
      "2: Atonement : A Novel, with distance of 0.9037760905570096:\n",
      "3: Cat's Eye, with distance of 0.9055037726842934:\n",
      "4: Blue Shoe, with distance of 0.9074036088730074:\n",
      "5: Waiting, with distance of 0.9096390816180597:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Susan\\Anaconda3\\lib\\site-packages\\ipykernel\\__main__.py:2: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead\n",
      "  from ipykernel import kernelapp as app\n"
     ]
    }
   ],
   "source": [
    "query_index = np.random.choice(us_canada_user_rating_pivot.shape[0])\n",
    "distances, indices = model_knn.kneighbors(us_canada_user_rating_pivot.iloc[query_index, :].reshape(1, -1), n_neighbors = 6)\n",
    "\n",
    "for i in range(0, len(distances.flatten())):\n",
    "    if i == 0:\n",
    "        print('Recommendations for {0}:\\n'.format(us_canada_user_rating_pivot.index[query_index]))\n",
    "    else:\n",
    "        print('{0}: {1}, with distance of {2}:'.format(i, us_canada_user_rating_pivot.index[indices.flatten()[i]], distances.flatten()[i]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Perfect! \"Green Mile Series\" books are definitely should be recommended one after another.   "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "us_canada_user_rating_pivot2 = us_canada_user_rating.pivot(index = 'userID', columns = 'bookTitle', values = 'bookRating').fillna(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>bookTitle</th>\n",
       "      <th>10 Lb. Penalty</th>\n",
       "      <th>16 Lighthouse Road</th>\n",
       "      <th>1984</th>\n",
       "      <th>1st to Die: A Novel</th>\n",
       "      <th>2010: Odyssey Two</th>\n",
       "      <th>204 Rosewood Lane</th>\n",
       "      <th>2061: Odyssey Three</th>\n",
       "      <th>24 Hours</th>\n",
       "      <th>2nd Chance</th>\n",
       "      <th>3rd Degree</th>\n",
       "      <th>...</th>\n",
       "      <th>YOU BELONG TO ME</th>\n",
       "      <th>Year of Wonders</th>\n",
       "      <th>You Belong To Me</th>\n",
       "      <th>You Shall Know Our Velocity</th>\n",
       "      <th>Young Wives</th>\n",
       "      <th>Zen and the Art of Motorcycle Maintenance: An Inquiry into Values</th>\n",
       "      <th>Zoya</th>\n",
       "      <th>\\O\\\" Is for Outlaw\"</th>\n",
       "      <th>\\Surely You're Joking, Mr. Feynman!\\\": Adventures of a Curious Character\"</th>\n",
       "      <th>stardust</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>userID</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 2442 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "bookTitle  10 Lb. Penalty  16 Lighthouse Road  1984  1st to Die: A Novel  \\\n",
       "userID                                                                     \n",
       "8                   0.000               0.000 0.000                0.000   \n",
       "9                   0.000               0.000 0.000                0.000   \n",
       "14                  0.000               0.000 0.000                0.000   \n",
       "16                  0.000               0.000 0.000                0.000   \n",
       "17                  0.000               0.000 0.000                0.000   \n",
       "\n",
       "bookTitle  2010: Odyssey Two  204 Rosewood Lane  2061: Odyssey Three  \\\n",
       "userID                                                                 \n",
       "8                      0.000              0.000                0.000   \n",
       "9                      0.000              0.000                0.000   \n",
       "14                     0.000              0.000                0.000   \n",
       "16                     0.000              0.000                0.000   \n",
       "17                     0.000              0.000                0.000   \n",
       "\n",
       "bookTitle  24 Hours  2nd Chance  3rd Degree    ...     YOU BELONG TO ME  \\\n",
       "userID                                         ...                        \n",
       "8             0.000       0.000       0.000    ...                0.000   \n",
       "9             0.000       0.000       0.000    ...                0.000   \n",
       "14            0.000       0.000       0.000    ...                0.000   \n",
       "16            0.000       0.000       0.000    ...                0.000   \n",
       "17            0.000       0.000       0.000    ...                0.000   \n",
       "\n",
       "bookTitle  Year of Wonders  You Belong To Me  You Shall Know Our Velocity  \\\n",
       "userID                                                                      \n",
       "8                    0.000             0.000                        0.000   \n",
       "9                    0.000             0.000                        0.000   \n",
       "14                   0.000             0.000                        0.000   \n",
       "16                   0.000             0.000                        0.000   \n",
       "17                   0.000             0.000                        0.000   \n",
       "\n",
       "bookTitle  Young Wives  \\\n",
       "userID                   \n",
       "8                0.000   \n",
       "9                0.000   \n",
       "14               0.000   \n",
       "16               0.000   \n",
       "17               0.000   \n",
       "\n",
       "bookTitle  Zen and the Art of Motorcycle Maintenance: An Inquiry into Values  \\\n",
       "userID                                                                         \n",
       "8                                                      0.000                   \n",
       "9                                                      0.000                   \n",
       "14                                                     0.000                   \n",
       "16                                                     0.000                   \n",
       "17                                                     0.000                   \n",
       "\n",
       "bookTitle  Zoya  \\O\\\" Is for Outlaw\"  \\\n",
       "userID                                 \n",
       "8         0.000                0.000   \n",
       "9         0.000                0.000   \n",
       "14        0.000                0.000   \n",
       "16        0.000                0.000   \n",
       "17        0.000                0.000   \n",
       "\n",
       "bookTitle  \\Surely You're Joking, Mr. Feynman!\\\": Adventures of a Curious Character\"  \\\n",
       "userID                                                                                 \n",
       "8                                                      0.000                           \n",
       "9                                                      0.000                           \n",
       "14                                                     0.000                           \n",
       "16                                                     0.000                           \n",
       "17                                                     0.000                           \n",
       "\n",
       "bookTitle  stardust  \n",
       "userID               \n",
       "8             0.000  \n",
       "9             0.000  \n",
       "14            0.000  \n",
       "16            0.000  \n",
       "17            0.000  \n",
       "\n",
       "[5 rows x 2442 columns]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "us_canada_user_rating_pivot2.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(40017, 2442)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "us_canada_user_rating_pivot2.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2442, 40017)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X = us_canada_user_rating_pivot2.values.T\n",
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2442, 12)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import sklearn\n",
    "from sklearn.decomposition import TruncatedSVD\n",
    "\n",
    "SVD = TruncatedSVD(n_components=12, random_state=17)\n",
    "matrix = SVD.fit_transform(X)\n",
    "matrix.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2442, 2442)"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\",category =RuntimeWarning)\n",
    "corr = np.corrcoef(matrix)\n",
    "corr.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1906\n"
     ]
    }
   ],
   "source": [
    "us_canada_book_title = us_canada_user_rating_pivot2.columns\n",
    "us_canada_book_list = list(us_canada_book_title)\n",
    "coffey_hands = us_canada_book_list.index(\"The Green Mile: Coffey's Hands (Green Mile Series)\")\n",
    "print(coffey_hands)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "corr_coffey_hands  = corr[coffey_hands]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Needful Things',\n",
       " 'The Bachman Books: Rage, the Long Walk, Roadwork, the Running Man',\n",
       " 'The Green Mile: Coffey on the Mile (Green Mile Series)',\n",
       " 'The Green Mile: Night Journey (Green Mile Series)',\n",
       " 'The Green Mile: The Bad Death of Eduard Delacroix (Green Mile Series)',\n",
       " 'The Green Mile: The Mouse on the Mile (Green Mile Series)',\n",
       " 'The Shining',\n",
       " 'The Two Dead Girls (Green Mile Series)']"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(us_canada_book_title[(corr_coffey_hands<1.0) & (corr_coffey_hands>0.9)])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The results look great!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
